In [1]:
from bs4 import BeautifulSoup
import pandas as pd
pd.options.display.max_columns = 100
In [2]:
base_path = "/data/kaggle/evergreen/"
def get_path(filename):
    return base_path + filename
In [3]:
df_train = pd.read_csv(get_path("train.tsv"), sep="\t")
df_train.head()
Out[3]:
In [4]:
df_test = pd.read_csv(get_path("test.tsv"), sep="\t")
df_test.head()
Out[4]:
In [5]:
df_train.boilerplate[0]
Out[5]:
In [6]:
df_train.label.value_counts()/len(df_train)
Out[6]:
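In [6] computes the class proportions; the majority-class share is the accuracy a constant predictor would reach, so it is the baseline to beat. The same proportions can be computed directly:

In [ ]:
# Equivalent to value_counts()/len(df_train)
df_train.label.value_counts(normalize=True)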
In [8]:
import json, re
In [9]:
def preprocess(boilerplate):
    d = json.loads(boilerplate)
    body = d["body"]
    if body is not None:
        # Remove HTML tags
        text = BeautifulSoup(body.lower(), "html5lib").text
        # Collapse runs of non-word characters (whitespace and
        # punctuation) into a single space (" ")
        text = re.sub(r"[\W]+", " ", text)
        return text
    return ""
preprocess(df_train.boilerplate[0])
Out[9]:
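The boilerplate JSON typically also carries "title" and "url" fields alongside "body"; a hedged sketch of a variant that folds the title into the cleaned text (assuming the "title" key is present, possibly null, in each record):

In [ ]:
def preprocess_with_title(boilerplate):
    # Sketch: combine the "title" and "body" fields before cleaning,
    # falling back to "" when either is null.
    d = json.loads(boilerplate)
    raw = " ".join([(d.get("title") or ""), (d.get("body") or "")])
    text = BeautifulSoup(raw.lower(), "html5lib").text
    return re.sub(r"[\W]+", " ", text)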
In [10]:
%%time
df_train["body"]= df_train.boilerplate.apply(preprocess)
In [11]:
%%time
df_test["body"]= df_test.boilerplate.apply(preprocess)
In [12]:
import nltk
In [13]:
def my_tokenizer(s):
    porter = nltk.stem.porter.PorterStemmer()
    # Stem every token, then keep only terms longer than two characters
    terms = [porter.stem(w) for w in nltk.word_tokenize(s)]
    terms = [term for term in terms if len(term) > 2]
    return terms
my_tokenizer("In Yellowstone National Park, warming has brought rapid changes.")
Out[13]:
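One performance note: my_tokenizer constructs a new PorterStemmer on every call, and the vectorizer below calls it once per document. A sketch of the same tokenizer with the stemmer hoisted out (identical output, just less overhead):

In [ ]:
porter = nltk.stem.porter.PorterStemmer()  # built once, shared across calls
def my_tokenizer_fast(s):
    # Same behaviour as my_tokenizer: stem, then keep terms longer than 2 chars
    return [t for t in (porter.stem(w) for w in nltk.word_tokenize(s)) if len(t) > 2]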
In [14]:
from sklearn import feature_extraction
In [15]:
stopwords = nltk.corpus.stopwords.words("english")
tfidf = feature_extraction.text.TfidfVectorizer(
    tokenizer=my_tokenizer,
    stop_words=stopwords,
    ngram_range=(1, 1))
In [16]:
%%time
body_train_tfidf = tfidf.fit_transform(df_train.body)
In [17]:
%%time
body_test_tfidf = tfidf.transform(df_test.body)
In [18]:
body_train_tfidf.shape, body_test_tfidf.shape
Out[18]:
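To sanity-check the fitted vocabulary, one can list the highest-weighted terms of a single document; a minimal sketch (assuming a scikit-learn version that provides get_feature_names_out, i.e. 1.0 or later):

In [ ]:
import numpy as np
terms = tfidf.get_feature_names_out()
weights = body_train_tfidf[0].toarray().ravel()
top = weights.argsort()[::-1][:10]   # indices of the ten largest tf-idf weights
list(zip(terms[top], weights[top]))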
In [19]:
type(body_train_tfidf)
Out[19]:
In [20]:
df_train.columns
Out[20]:
In [27]:
columns = ['avglinksize', 'commonlinkratio_1',
'commonlinkratio_2', 'commonlinkratio_3', 'commonlinkratio_4',
'compression_ratio', 'embed_ratio', 'framebased', 'frameTagRatio',
'hasDomainLink', 'html_ratio', 'image_ratio', 'is_news',
'lengthyLinkDomain', 'linkwordscore',
'non_markup_alphanum_characters', 'numberOfLinks', 'numwords_in_url',
'parametrizedLinkRatio', 'spelling_errors_ratio']
# Excluded: 'news_front_page', 'alchemy_category_score'
X = pd.concat([df_train, df_test])[columns]
X = pd.get_dummies(X, drop_first=True)
print(X.shape)
X_train = X.iloc[:len(df_train), :]
X_test = X.iloc[len(df_train):, :]
X_train.shape, X_test.shape
Out[27]:
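get_dummies only expands non-numeric columns, so it is worth confirming which of the selected columns are objects in the raw frames; this dataset reportedly marks missing values with the string '?', which is what turns otherwise numeric columns into objects. A quick check (the '?' convention is an assumption about the raw TSVs):

In [ ]:
raw = pd.concat([df_train, df_test])[columns]
print(raw.select_dtypes(include="object").columns.tolist())  # what get_dummies expands
print((raw == "?").sum().loc[lambda s: s > 0])               # '?' counts per column, if any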
In [28]:
from sklearn import preprocessing
In [29]:
import scipy as sp
import numpy as np
In [30]:
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
In [31]:
X_train = sp.sparse.hstack((X_train, body_train_tfidf))
X_test = sp.sparse.hstack((X_test, body_test_tfidf))
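sp.sparse.hstack returns a COO matrix by default, which does not support row indexing; converting to CSR once avoids repeated format conversions in the splitting and fitting steps below (optional, since scikit-learn converts internally anyway):

In [ ]:
X_train = X_train.tocsr()
X_test = X_test.tocsr()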
In [32]:
X_train.shape, X_test.shape
Out[32]:
In [34]:
label_encoder = preprocessing.LabelEncoder()
y_train = label_encoder.fit_transform(df_train.label)
# The test set has no label column, so there is nothing to transform:
#y_test = label_encoder.transform(df_test.label)
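Since label is already a 0/1 integer column, the encoder should act as an identity mapping; a quick check:

In [ ]:
print(label_encoder.classes_)                          # expected: [0 1]
print(np.array_equal(y_train, df_train.label.values))  # True if it was an identity map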
In [36]:
from sklearn import linear_model, metrics, model_selection
In [69]:
%%time
X1, X2, y1, y2 = model_selection.train_test_split(
    X_train, y_train, test_size=0.3, random_state=1)
lr = linear_model.LogisticRegression(C=1.0, random_state=1, max_iter=10000,
                                     n_jobs=12, solver="saga")
lr.fit(X1, y1)
y2_pred = lr.predict(X2)
print("Accuracy: ", metrics.accuracy_score(y2, y2_pred))
In [68]:
%%time
lr = linear_model.LogisticRegression(random_state=1, max_iter=5000,
                                     n_jobs=12, solver="saga")
lr.fit(X_train, y_train)
y_test_pred = lr.predict(X_test)
In [48]:
submission = pd.DataFrame({"urlid": df_test.urlid, "label": y_test_pred})
submission.sample(10)
Out[48]:
In [50]:
submission.to_csv("/tmp/submission.csv", index=False)
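For an AUC-scored leaderboard, submitting the positive-class probability instead of a hard 0/1 label usually scores better; a variant submission (assuming the grader accepts real-valued labels, which AUC scoring implies):

In [ ]:
submission_proba = pd.DataFrame({
    "urlid": df_test.urlid,
    "label": lr.predict_proba(X_test)[:, 1],  # P(evergreen) rather than a hard label
})
submission_proba.to_csv("/tmp/submission_proba.csv", index=False)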